{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "demo-chinese-text-binary-classification-with-bert.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/wshuyi/demo-chinese-text-binary-classification-with-bert/blob/master/demo_chinese_text_binary_classification_with_bert.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "metadata": {
        "id": "GBbAMqZ6IZuw",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "base code borrowed from [this Google Colab Notebook](https://colab.research.google.com/github/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb).\n",
        "\n",
        "Refactored by [Shuyi Wang](https://www.linkedin.com/in/shuyi-wang-b3955026/)\n",
        "\n",
        "Please refer to [this Medium Article](https://medium.com/@wshuyi/how-to-do-text-binary-classification-with-bert-f1348a25d905) for the tutorial on how to classify English text data.\n",
        "\n"
      ]
    },
    {
      "metadata": {
        "id": "jviywGyWyKsA",
        "colab_type": "code",
        "trusted": true,
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "!pip install bert-tensorflow"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "hsZvic2YxnTz",
        "colab_type": "code",
        "trusted": true,
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import tensorflow as tf\n",
        "import tensorflow_hub as hub\n",
        "import pickle\n",
        "import bert\n",
        "from bert import run_classifier\n",
        "from bert import optimization\n",
        "from bert import tokenization"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "colab_type": "code",
        "id": "NZM71PjIOF_I",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "def pretty_print(result):\n",
        "    df = pd.DataFrame([result]).T\n",
        "    df.columns = [\"values\"]\n",
        "    return df"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "id": "ZCYOL8HbIZu2",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "def create_tokenizer_from_hub_module(bert_model_hub):\n",
        "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
        "  with tf.Graph().as_default():\n",
        "    bert_module = hub.Module(bert_model_hub)\n",
        "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
        "    with tf.Session() as sess:\n",
        "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
        "                                            tokenization_info[\"do_lower_case\"]])\n",
        "      \n",
        "  return bert.tokenization.FullTokenizer(\n",
        "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
        "\n",
        "def make_features(dataset, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN):\n",
        "    input_example = dataset.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
        "                                                                   text_a = x[DATA_COLUMN], \n",
        "                                                                   text_b = None, \n",
        "                                                                   label = x[LABEL_COLUMN]), axis = 1)\n",
        "    features = bert.run_classifier.convert_examples_to_features(input_example, label_list, MAX_SEQ_LENGTH, tokenizer)\n",
        "    return features\n",
        "\n",
        "def create_model(bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, labels,\n",
        "                 num_labels):\n",
        "  \"\"\"Creates a classification model.\"\"\"\n",
        "\n",
        "  bert_module = hub.Module(\n",
        "      bert_model_hub,\n",
        "      trainable=True)\n",
        "  bert_inputs = dict(\n",
        "      input_ids=input_ids,\n",
        "      input_mask=input_mask,\n",
        "      segment_ids=segment_ids)\n",
        "  bert_outputs = bert_module(\n",
        "      inputs=bert_inputs,\n",
        "      signature=\"tokens\",\n",
        "      as_dict=True)\n",
        "\n",
        "  # Use \"pooled_output\" for classification tasks on an entire sentence.\n",
        "  # Use \"sequence_outputs\" for token-level output.\n",
        "  output_layer = bert_outputs[\"pooled_output\"]\n",
        "\n",
        "  hidden_size = output_layer.shape[-1].value\n",
        "\n",
        "  # Create our own layer to tune for politeness data.\n",
        "  output_weights = tf.get_variable(\n",
        "      \"output_weights\", [num_labels, hidden_size],\n",
        "      initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
        "\n",
        "  output_bias = tf.get_variable(\n",
        "      \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n",
        "\n",
        "  with tf.variable_scope(\"loss\"):\n",
        "\n",
        "    # Dropout helps prevent overfitting\n",
        "    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n",
        "\n",
        "    logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n",
        "    logits = tf.nn.bias_add(logits, output_bias)\n",
        "    log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
        "\n",
        "    # Convert labels into one-hot encoding\n",
        "    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n",
        "\n",
        "    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n",
        "    # If we're predicting, we want predicted labels and the probabiltiies.\n",
        "    if is_predicting:\n",
        "      return (predicted_labels, log_probs)\n",
        "\n",
        "    # If we're train/eval, compute loss between predicted and actual label\n",
        "    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n",
        "    loss = tf.reduce_mean(per_example_loss)\n",
        "    return (loss, predicted_labels, log_probs)\n",
        "\n",
        "# model_fn_builder actually creates our model function\n",
        "# using the passed parameters for num_labels, learning_rate, etc.\n",
        "def model_fn_builder(bert_model_hub, num_labels, learning_rate, num_train_steps,\n",
        "                     num_warmup_steps):\n",
        "  \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n",
        "  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
        "    \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
        "\n",
        "    input_ids = features[\"input_ids\"]\n",
        "    input_mask = features[\"input_mask\"]\n",
        "    segment_ids = features[\"segment_ids\"]\n",
        "    label_ids = features[\"label_ids\"]\n",
        "\n",
        "    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n",
        "    \n",
        "    # TRAIN and EVAL\n",
        "    if not is_predicting:\n",
        "\n",
        "      (loss, predicted_labels, log_probs) = create_model(\n",
        "        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
        "\n",
        "      train_op = bert.optimization.create_optimizer(\n",
        "          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n",
        "\n",
        "      # Calculate evaluation metrics. \n",
        "      def metric_fn(label_ids, predicted_labels):\n",
        "        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n",
        "        f1_score = tf.contrib.metrics.f1_score(\n",
        "            label_ids,\n",
        "            predicted_labels)\n",
        "        auc = tf.metrics.auc(\n",
        "            label_ids,\n",
        "            predicted_labels)\n",
        "        recall = tf.metrics.recall(\n",
        "            label_ids,\n",
        "            predicted_labels)\n",
        "        precision = tf.metrics.precision(\n",
        "            label_ids,\n",
        "            predicted_labels) \n",
        "        true_pos = tf.metrics.true_positives(\n",
        "            label_ids,\n",
        "            predicted_labels)\n",
        "        true_neg = tf.metrics.true_negatives(\n",
        "            label_ids,\n",
        "            predicted_labels)   \n",
        "        false_pos = tf.metrics.false_positives(\n",
        "            label_ids,\n",
        "            predicted_labels)  \n",
        "        false_neg = tf.metrics.false_negatives(\n",
        "            label_ids,\n",
        "            predicted_labels)\n",
        "        return {\n",
        "            \"eval_accuracy\": accuracy,\n",
        "            \"f1_score\": f1_score,\n",
        "            \"auc\": auc,\n",
        "            \"precision\": precision,\n",
        "            \"recall\": recall,\n",
        "            \"true_positives\": true_pos,\n",
        "            \"true_negatives\": true_neg,\n",
        "            \"false_positives\": false_pos,\n",
        "            \"false_negatives\": false_neg\n",
        "        }\n",
        "\n",
        "      eval_metrics = metric_fn(label_ids, predicted_labels)\n",
        "\n",
        "      if mode == tf.estimator.ModeKeys.TRAIN:\n",
        "        return tf.estimator.EstimatorSpec(mode=mode,\n",
        "          loss=loss,\n",
        "          train_op=train_op)\n",
        "      else:\n",
        "          return tf.estimator.EstimatorSpec(mode=mode,\n",
        "            loss=loss,\n",
        "            eval_metric_ops=eval_metrics)\n",
        "    else:\n",
        "      (predicted_labels, log_probs) = create_model(\n",
        "        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n",
        "\n",
        "      predictions = {\n",
        "          'probabilities': log_probs,\n",
        "          'labels': predicted_labels\n",
        "      }\n",
        "      return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n",
        "\n",
        "  # Return the actual model function in the closure\n",
        "  return model_fn\n",
        "\n",
        "def estimator_builder(bert_model_hub, OUTPUT_DIR, SAVE_SUMMARY_STEPS, SAVE_CHECKPOINTS_STEPS, label_list, LEARNING_RATE, num_train_steps, num_warmup_steps, BATCH_SIZE):\n",
        "\n",
        "    # Specify outpit directory and number of checkpoint steps to save\n",
        "    run_config = tf.estimator.RunConfig(\n",
        "        model_dir=OUTPUT_DIR,\n",
        "        save_summary_steps=SAVE_SUMMARY_STEPS,\n",
        "        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)\n",
        "\n",
        "    model_fn = model_fn_builder(\n",
        "      bert_model_hub = bert_model_hub,\n",
        "      num_labels=len(label_list),\n",
        "      learning_rate=LEARNING_RATE,\n",
        "      num_train_steps=num_train_steps,\n",
        "      num_warmup_steps=num_warmup_steps)\n",
        "\n",
        "    estimator = tf.estimator.Estimator(\n",
        "      model_fn=model_fn,\n",
        "      config=run_config,\n",
        "      params={\"batch_size\": BATCH_SIZE})\n",
        "    return estimator, model_fn, run_config\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "IuMOGwFui4it",
        "colab_type": "code",
        "trusted": true,
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "def run_on_dfs(train, test, DATA_COLUMN, LABEL_COLUMN, \n",
        "               MAX_SEQ_LENGTH = 128,\n",
        "              BATCH_SIZE = 32,\n",
        "              LEARNING_RATE = 2e-5,\n",
        "              NUM_TRAIN_EPOCHS = 3.0,\n",
        "              WARMUP_PROPORTION = 0.1,\n",
        "              SAVE_SUMMARY_STEPS = 100,\n",
        "               SAVE_CHECKPOINTS_STEPS = 10000,\n",
        "              bert_model_hub = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"):\n",
        "\n",
        "    label_list = train[LABEL_COLUMN].unique().tolist()\n",
        "    \n",
        "    tokenizer = create_tokenizer_from_hub_module(bert_model_hub)\n",
        "\n",
        "    train_features = make_features(train, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)\n",
        "    test_features = make_features(test, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)\n",
        "\n",
        "    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n",
        "    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)\n",
        "\n",
        "    estimator, model_fn, run_config = estimator_builder(\n",
        "                                  bert_model_hub, \n",
        "                                  OUTPUT_DIR, \n",
        "                                  SAVE_SUMMARY_STEPS, \n",
        "                                  SAVE_CHECKPOINTS_STEPS, \n",
        "                                  label_list, \n",
        "                                  LEARNING_RATE, \n",
        "                                  num_train_steps, \n",
        "                                  num_warmup_steps, \n",
        "                                  BATCH_SIZE)\n",
        "\n",
        "    train_input_fn = bert.run_classifier.input_fn_builder(\n",
        "        features=train_features,\n",
        "        seq_length=MAX_SEQ_LENGTH,\n",
        "        is_training=True,\n",
        "        drop_remainder=False)\n",
        "\n",
        "    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n",
        "\n",
        "    test_input_fn = run_classifier.input_fn_builder(\n",
        "        features=test_features,\n",
        "        seq_length=MAX_SEQ_LENGTH,\n",
        "        is_training=False,\n",
        "        drop_remainder=False)\n",
        "\n",
        "    result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)\n",
        "    return result_dict, estimator\n",
        "    "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "id": "wpUHTA8LIZu_",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import random\n",
        "random.seed(10)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "id": "qmP8MBvjIZvB",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "OUTPUT_DIR = 'output'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "UV3ExI2JJMhd",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "----- you just need to focus from here ------"
      ]
    },
    {
      "metadata": {
        "id": "XSUjVoFtJVEO",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Get your data"
      ]
    },
    {
      "metadata": {
        "id": "pe6HHONmOwEs",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "!wget https://github.com/wshuyi/demo-chinese-text-binary-classification-with-bert/raw/master/dianping_train_test.pickle"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "id": "XFBwDlmnIZvD",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "with open(\"dianping_train_test.pickle\", 'rb') as f:\n",
        "    train, test = pickle.load(f)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true,
        "id": "Tju0c4dqIZvK",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "train = train.sample(len(train))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "p9BAxyKhKirc",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "train.head()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "hM8M7k0gKk5H",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "myparam = {\n",
        "        \"DATA_COLUMN\": \"comment\",\n",
        "        \"LABEL_COLUMN\": \"sentiment\",\n",
        "        \"LEARNING_RATE\": 2e-5,\n",
        "        \"NUM_TRAIN_EPOCHS\":3,\n",
        "        \"bert_model_hub\":\"https://tfhub.dev/google/bert_chinese_L-12_H-768_A-12/1\"\n",
        "    }"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "colab_type": "code",
        "id": "Dg2apeXpMqG-",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "result, estimator = run_on_dfs(train, test, **myparam)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "YLOvtveTMqwp",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "pretty_print(result)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "D0OPTDcGMsJw",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}